From 1be1142d47bf7bfe16868be62805b7aedc866954 Mon Sep 17 00:00:00 2001 From: Daniel Sabo Date: Sun, 20 Dec 2015 03:14:19 -0800 Subject: [PATCH] Add SSE4.1 u8 -> float conversions --- configure.ac | 24 +++++ extensions/Makefile.am | 3 + extensions/sse4-int8.c | 218 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 245 insertions(+) create mode 100644 extensions/sse4-int8.c diff --git a/configure.ac b/configure.ac index 66ebc77..f09c7ac 100644 --- a/configure.ac +++ b/configure.ac @@ -299,10 +299,15 @@ AC_ARG_ENABLE(sse2, [ --enable-sse2 enable SSE2 support (default=auto)],, enable_sse2=$enable_sse) +AC_ARG_ENABLE(sse4_1, + [ --enable-sse4_1 enable SSE4_1 support (default=auto)],, + enable_sse4_1=$enable_sse) + if test "x$enable_mmx" = xyes; then BABL_DETECT_CFLAGS(MMX_EXTRA_CFLAGS, '-mmmx') SSE_EXTRA_CFLAGS= SSE2_EXTRA_CFLAGS= + SSE4_1_EXTRA_CFLAGS= AC_MSG_CHECKING(whether we can compile MMX code) @@ -353,6 +358,24 @@ if test "x$enable_mmx" = xyes; then AC_MSG_RESULT(no) AC_MSG_WARN([The assembler does not support the SSE2 command set.]) ) + + if test "x$enable_sse4_1" = xyes; then + BABL_DETECT_CFLAGS(sse4_1_flag, '-msse4.1') + SSE4_1_EXTRA_CFLAGS="$SSE_EXTRA_CFLAGS $sse4_1_flag" + + AC_MSG_CHECKING(whether we can compile SSE4_1 code) + + CFLAGS="$CFLAGS $sse4_1_flag" + + AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("pmovzxbd %xmm0,%xmm1");])], + AC_DEFINE(USE_SSE4_1, 1, [Define to 1 if SSE4_1 assembly is available.]) + AC_MSG_RESULT(yes) + , + enable_sse4_1=no + AC_MSG_RESULT(no) + AC_MSG_WARN([The assembler does not support the SSE4_1 command set.]) + ) + fi fi fi @@ -367,6 +390,7 @@ if test "x$enable_mmx" = xyes; then AC_SUBST(MMX_EXTRA_CFLAGS) AC_SUBST(SSE_EXTRA_CFLAGS) AC_SUBST(SSE2_EXTRA_CFLAGS) + AC_SUBST(SSE4_1_EXTRA_CFLAGS) fi diff --git a/extensions/Makefile.am b/extensions/Makefile.am index 4a3fb8a..cd7e893 100644 --- a/extensions/Makefile.am +++ b/extensions/Makefile.am @@ -31,6 +31,7 @@ ext_LTLIBRARIES = \ sse2-float.la \ sse2-int8.la \ 
sse2-int16.la \ + sse4-int8.la \ two-table.la \ ycbcr.la @@ -48,6 +49,7 @@ HSV_la_SOURCES = HSV.c sse2_float_la_SOURCES = sse2-float.c sse2_int8_la_SOURCES = sse2-int8.c sse2_int16_la_SOURCES = sse2-int16.c +sse4_int8_la_SOURCES = sse4-int8.c two_table_la_SOURCES = two-table.c two-table-tables.h ycbcr_la_SOURCES = ycbcr.c float_la_SOURCES = float.c @@ -59,3 +61,4 @@ LIBS = $(top_builddir)/babl/libbabl-@BABL_API_VERSION@.la $(MATH_LIB) \ sse2_float_la_CFLAGS = $(SSE2_EXTRA_CFLAGS) sse2_int8_la_CFLAGS = $(SSE2_EXTRA_CFLAGS) sse2_int16_la_CFLAGS = $(SSE2_EXTRA_CFLAGS) +sse4_int8_la_CFLAGS = $(SSE4_1_EXTRA_CFLAGS) diff --git a/extensions/sse4-int8.c b/extensions/sse4-int8.c new file mode 100644 index 0000000..73f63e3 --- /dev/null +++ b/extensions/sse4-int8.c @@ -0,0 +1,218 @@ +/* babl - dynamically extendable universal pixel conversion library. + * Copyright (C) 2013 Daniel Sabo + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General + * Public License along with this library; if not, see + * <http://www.gnu.org/licenses/>. 
+ */
+
+#include "config.h"
+
+#if defined(USE_SSE4_1)
+
+#include <smmintrin.h> /* SSE 4.1 */
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "babl.h"
+#include "babl-cpuaccel.h"
+#include "extensions/util.h"
+
+/* Convert `samples` u8 components at src to floats at dst, scaling by 1/255.
+ * Four are widened per iteration with SSE4.1, the remainder one at a time;
+ * src needs no particular alignment.  Returns samples unchanged.
+ */
+static inline long
+conv_y8_yF (const uint8_t *src, float *dst, long samples)
+{
+  const float  factor     = 1.0f / 255.0f;
+  const __m128 factor_vec = _mm_set1_ps (factor);
+  long         n          = samples;
+
+  while (n >= 4)
+    {
+      uint32_t packed;
+      __m128i  in_val;
+
+      /* Copy bytewise: src carries no 4-byte alignment guarantee. */
+      __builtin_memcpy (&packed, src, sizeof (packed));
+      /* cvtsi32 zero-fills the upper lanes: nothing uninitialized is read. */
+      in_val = _mm_cvtepu8_epi32 (_mm_cvtsi32_si128 ((int) packed));
+      _mm_storeu_ps (dst, _mm_mul_ps (_mm_cvtepi32_ps (in_val), factor_vec));
+      src += 4;
+      dst += 4;
+      n   -= 4;
+    }
+
+  while (n > 0) /* scalar tail; "> 0" also stops a negative count */
+    {
+      *dst++ = (float) (*src++) * factor;
+      n -= 1;
+    }
+
+  return samples;
+}
+
+/* Multi-component pixels are converted as one flat run of components. */
+static long
+conv_ya8_yaF (const uint8_t *src, float *dst, long samples)
+{
+  return conv_y8_yF (src, dst, samples * 2) / 2;
+}
+
+static long
+conv_rgb8_rgbF (const uint8_t *src, float *dst, long samples)
+{
+  return conv_y8_yF (src, dst, samples * 3) / 3;
+}
+
+static long
+conv_rgba8_rgbaF (const uint8_t *src, float *dst, long samples)
+{
+  return conv_y8_yF (src, dst, samples * 4) / 4;
+}
+
+#endif
+
+int init (void);
+
+/* Register the u8 -> float conversions if the CPU reports SSE4.1; returns 0. */
+int
+init (void)
+{
+#if defined(USE_SSE4_1)
+  const Babl *rgbaF_linear = babl_format_new (
+    babl_model ("RGBA"),
+    babl_type ("float"),
+    babl_component ("R"),
+    babl_component ("G"),
+    babl_component ("B"),
+    babl_component ("A"),
+    NULL);
+  const Babl *rgba8_linear = babl_format_new (
+    babl_model ("RGBA"),
+    babl_type ("u8"),
+    babl_component ("R"),
+    babl_component ("G"),
+    babl_component ("B"),
+    babl_component ("A"),
+    NULL);
+  const Babl *rgbaF_gamma = babl_format_new (
+    babl_model ("R'G'B'A"),
+    babl_type ("float"),
+    babl_component ("R'"),
+    babl_component ("G'"),
+    babl_component ("B'"),
+    babl_component ("A"),
+    NULL);
+  const Babl *rgba8_gamma = babl_format_new (
+    babl_model ("R'G'B'A"),
+    babl_type ("u8"),
+    babl_component ("R'"),
+    babl_component ("G'"),
+    babl_component ("B'"),
+    babl_component ("A"),
+    NULL);
+  const Babl *rgbF_linear = babl_format_new (
+    babl_model ("RGB"),
+    babl_type ("float"),
+    babl_component ("R"),
+    babl_component ("G"),
+    babl_component ("B"),
+    NULL);
+  const Babl *rgb8_linear = babl_format_new (
+    babl_model ("RGB"),
+    babl_type ("u8"),
+    babl_component ("R"),
+    babl_component ("G"),
+    babl_component ("B"),
+    NULL);
+  const Babl *rgbF_gamma = babl_format_new (
+    babl_model ("R'G'B'"),
+    babl_type ("float"),
+    babl_component ("R'"),
+    babl_component ("G'"),
+    babl_component ("B'"),
+    NULL);
+  const Babl *rgb8_gamma = babl_format_new (
+    babl_model ("R'G'B'"),
+    babl_type ("u8"),
+    babl_component ("R'"),
+    babl_component ("G'"),
+    babl_component ("B'"),
+    NULL);
+  const Babl *yaF_linear = babl_format_new (
+    babl_model ("YA"),
+    babl_type ("float"),
+    babl_component ("Y"),
+    babl_component ("A"),
+    NULL);
+  const Babl *ya8_linear = babl_format_new (
+    babl_model ("YA"),
+    babl_type ("u8"),
+    babl_component ("Y"),
+    babl_component ("A"),
+    NULL);
+  const Babl *yaF_gamma = babl_format_new (
+    babl_model ("Y'A"),
+    babl_type ("float"),
+    babl_component ("Y'"),
+    babl_component ("A"),
+    NULL);
+  const Babl *ya8_gamma = babl_format_new (
+    babl_model ("Y'A"),
+    babl_type ("u8"),
+    babl_component ("Y'"),
+    babl_component ("A"),
+    NULL);
+  const Babl *yF_linear = babl_format_new (
+    babl_model ("Y"),
+    babl_type ("float"),
+    babl_component ("Y"),
+    NULL);
+  const Babl *y8_linear = babl_format_new (
+    babl_model ("Y"),
+    babl_type ("u8"),
+    babl_component ("Y"),
+    NULL);
+  const Babl *yF_gamma = babl_format_new (
+    babl_model ("Y'"),
+    babl_type ("float"),
+    babl_component ("Y'"),
+    NULL);
+  const Babl *y8_gamma = babl_format_new (
+    babl_model ("Y'"),
+    babl_type ("u8"),
+    babl_component ("Y'"),
+    NULL);
+
+#define CONV(src, dst) \
+do { \
+  babl_conversion_new (src ## _linear, dst ## _linear, "linear", conv_ ## src ## _ ## dst, NULL); \
+  babl_conversion_new (src ## _gamma, dst ## _gamma, "linear", conv_ ## src ## _ ## dst, NULL); \
+} while (0)
+
+  if ((babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE4_1))
+    {
+      CONV(rgba8, rgbaF);
+      CONV(rgb8, rgbF);
+      CONV(ya8, yaF);
+      CONV(y8, yF);
+    }
+
+#undef CONV
+#endif
+
+  return 0;
+}
-- 
2.30.2